For this assignment, we will look in more detail at the per digit accuracy.
The extra credit examines the issue of how much data does one need to do classification (at least in this simple case).
Here I want you to read in the digit data, and define two samples:
Remember to shuffle the data before doing the 90/10 split!
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
import pandas as pd
#
short = ""
#short = "short_"
data_location = '/fs/ess/PAS2038/PHYSICS5680_OSU/'
# Read every digit's CSV, tag each row with its label, and stack them
frames = []
for digit in range(10):
    print("Processing digit ",digit)
    fname = f"{data_location}/data/ch3/digit_{short}{digit}.csv"
    frame = pd.read_csv(fname, header=None)
    frame['digit'] = digit
    frames.append(frame)
dfCombined = pd.concat(frames)
print("Length of sample: ",len(dfCombined))
Processing digit 0 Processing digit 1 Processing digit 2 Processing digit 3 Processing digit 4 Processing digit 5 Processing digit 6 Processing digit 7 Processing digit 8 Processing digit 9 Length of sample: 70000
from sklearn.utils import shuffle

# Shuffle the combined sample, then split 90/10 into the working
# sample (X, y) and a holdout sample used only for the final evaluation.
dfCombinedShuffle = shuffle(dfCombined, random_state=42)
train_length = int(0.9 * len(dfCombinedShuffle))
features = dfCombinedShuffle.iloc[:, :784]
labels = dfCombinedShuffle.iloc[:, 784]
X = features.iloc[:train_length].to_numpy()
y = labels.iloc[:train_length].values
X_holdout = features.iloc[train_length:].to_numpy()
y_holdout = labels.iloc[train_length:].values
print(len(X), len(X_holdout))
63000 7000
Get the following methods:
from collections import defaultdict
from functools import partial
from itertools import repeat

def nested_defaultdict(default_factory, depth=1):
    """Build a defaultdict nested `depth` levels deep whose innermost
    values come from `default_factory` (e.g. depth=2 with int gives a
    2-D counter: d[a][b] -> 0)."""
    factory = partial(defaultdict, default_factory)
    for _ in range(depth - 1):
        factory = partial(defaultdict, factory)
    return factory()

# Determine the performance
def multiPerformance(y, y_pred, y_score, debug=False):
    """Summarize multi-class performance.

    Parameters
    ----------
    y : sequence of true class labels
    y_pred : sequence of predicted class labels (same length as y)
    y_score : decision scores (currently unused; kept for interface parity)
    debug : when True, print the confusion matrix

    Returns a dict with 'confusionMatrix' (nested dict, [true][pred]),
    'accuracyMicro' (diagonal sum / total), and 'accuracyMacro'
    (mean of the per-class recalls).
    """
    confusionMatrix = nested_defaultdict(int, 2)
    classes = set()
    totalTrue = defaultdict(int)
    totalPred = defaultdict(int)
    for i, predClass in enumerate(y_pred):
        trueClass = y[i]
        classes.add(trueClass)
        totalTrue[trueClass] += 1
        totalPred[predClass] += 1
        confusionMatrix[trueClass][predClass] += 1
    if debug:
        for trueClass in classes:
            print("True: ", trueClass, end="")
            for predClass in classes:
                print("\t", confusionMatrix[trueClass][predClass], end="")
            print()
        print()
    # Micro accuracy: sum the diagonal and divide by the total sample size.
    accMicro = sum(confusionMatrix[cl][cl] for cl in classes) / len(y)
    # Macro accuracy: average the per-class accuracy (recall) over classes.
    accMacro = sum(confusionMatrix[cl][cl] / totalTrue[cl] for cl in classes) / len(classes)
    return {"confusionMatrix": confusionMatrix,
            "accuracyMicro": accMicro,
            "accuracyMacro": accMacro}
def runFitter(estimator, X_train, y_train, X_test, y_test, debug=False):
    """Fit `estimator` on the training split and score both splits.

    NOTE: assumes the estimator exposes decision_function; some
    estimators provide predict_proba instead — TODO confirm for any
    new estimator used here.

    Returns a dict with the train/test confusion matrices and the
    micro/macro accuracies for both splits.
    """
    # Fit on the training sample
    estimator.fit(X_train, y_train)
    # Predicted classes and decision scores for each split
    y_train_pred = estimator.predict(X_train)
    y_train_score = estimator.decision_function(X_train)
    y_test_pred = estimator.predict(X_test)
    y_test_score = estimator.decision_function(X_test)
    # Summarize the performance on each split
    results_train = multiPerformance(y_train, y_train_pred, y_train_score, debug=False)
    results_test = multiPerformance(y_test, y_test_pred, y_test_score, debug=False)
    # For now return just the confusion matrices and accuracies
    return {
        'cf_test': results_test['confusionMatrix'],
        'cf_train': results_train['confusionMatrix'],
        'accuracyMicro_test': results_test['accuracyMicro'],
        'accuracyMacro_test': results_test['accuracyMacro'],
        'accuracyMicro_train': results_train['accuracyMicro'],
        'accuracyMacro_train': results_train['accuracyMacro'],
    }
Here we want to loop over the kfolds (just like we did in multiclassv2), but we want to calculate the average per digit accuracy over the folds.
So for each fold, you need to calculate the test accuracy for each digit (0,1,2,..9), and then get the average of these over the 5 folds. You can get the accuracy for each digit within each fold by using the cf_test confusion matrix:
Store these in a dictionary (with key the true class) called "accuracies_by_digit".
#
# Get our estimator and run the k-fold cross validation
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import numpy as np

kfolds = 5
skf = StratifiedKFold(n_splits=kfolds)
#skf = KFold(n_splits=kfolds)
#
estimator = LinearSVC(random_state=42,dual=False,max_iter=500,tol=0.01) # use dual=False when n_samples > n_features which is what we have
#estimator = SGDClassifier(random_state=42,max_iter=500,tol=0.01)
#
# Create some vars to keep track of everything
avg_accuracyMicro_test = 0.0
avg_accuracyMicro_train = 0.0
avg_accuracyMacro_test = 0.0
avg_accuracyMacro_train = 0.0
numSplits = 0.0
#
# Per-digit running sums over the folds. defaultdict(float) yields 0.0 for a
# missing key, so we can accumulate with "+=" directly. This replaces the
# earlier "has this key been seen yet" truthiness checks, which inserted keys
# as a side effect and would misfire for any digit whose first-fold value was
# exactly 0 (the "first time" branch would run again and clobber the sum).
accuracies_by_digit = defaultdict(float)
fakeratio_by_digit = defaultdict(float)
for train_index, test_index in skf.split(X, y):
    print("Training")
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    #
    # Fit on this fold's training split and evaluate both splits
    results = runFitter(estimator,X_train,y_train,X_test,y_test)
    avg_accuracyMicro_test += results['accuracyMicro_test']
    avg_accuracyMicro_train += results['accuracyMicro_train']
    avg_accuracyMacro_test += results['accuracyMacro_test']
    avg_accuracyMacro_train += results['accuracyMacro_train']
    lastCF_train = results['cf_train']
    lastCF_test = results['cf_test']
    numSplits += 1.0
    print(" Split ",numSplits,"; accuracyMicro test/train",results['accuracyMicro_test'],results['accuracyMicro_train'],"; accuracyMacro test/train",results['accuracyMacro_test'],results['accuracyMacro_train'])
    #
    # Accumulate this fold's per-digit test accuracy (diagonal / row sum) and
    # "fake ratio" (off-diagonal column sum / diagonal element).
    cf = results['cf_test']
    for i in range(10):
        correct = cf[i][i]                # correctly classified digit i
        row_total = sum(cf[i].values())   # how many true i's were in this fold
        accuracies_by_digit[f'Digit {i}'] += correct/row_total
        fakes = sum(cf[j][i] for j in range(10) if j != i)  # other digits predicted as i
        fakeratio_by_digit[f'Digit {i}'] += fakes/correct
#
# Average the per-digit quantities over the folds and report them
for i in range(10):
    accuracies_by_digit[f'Digit {i}'] /= float(kfolds)
    print(f'average accuracy for digit {i}: ',str(accuracies_by_digit[f'Digit {i}']))
    fakeratio_by_digit[f'Digit {i}'] /= float(kfolds)
    print(f'average fake ratio for digit {i}: ',str(fakeratio_by_digit[f'Digit {i}']))
avg_accuracyMicro_test /= numSplits
avg_accuracyMicro_train /= numSplits
avg_accuracyMacro_test /= numSplits
avg_accuracyMacro_train /= numSplits
# Now print
print("Average Micro Accuracy train/test ",round(avg_accuracyMicro_train,3),round(avg_accuracyMicro_test,3))
print("Average Macro Accuracy train/test ",round(avg_accuracyMacro_train,3),round(avg_accuracyMacro_test,3))
Training Split 1.0 ; accuracyMicro test/train 0.9144444444444444 0.920515873015873 ; accuracyMacro test/train 0.9131100406170398 0.9192442401168878 Training Split 2.0 ; accuracyMicro test/train 0.9088888888888889 0.9218650793650793 ; accuracyMacro test/train 0.9073044838691733 0.9206623704630179 Training Split 3.0 ; accuracyMicro test/train 0.9096031746031746 0.9234325396825397 ; accuracyMacro test/train 0.9081641731346879 0.9222573779820552 Training Split 4.0 ; accuracyMicro test/train 0.9106349206349207 0.9218055555555555 ; accuracyMacro test/train 0.9091721789224477 0.9205824820975961 Training Split 5.0 ; accuracyMicro test/train 0.9137301587301587 0.9203373015873015 ; accuracyMacro test/train 0.91241738920917 0.9190419330574272 average accuracy for digit 0: 0.9711411336806173 average fake ratio for digit 0: 0.055769766661742895 average accuracy for digit 1: 0.976053062561415 average fake ratio for digit 1: 0.053537458963305105 average accuracy for digit 2: 0.8823248407643313 average fake ratio for digit 2: 0.09563435058423912 average accuracy for digit 3: 0.8882944695517268 average fake ratio for digit 3: 0.11745276194618995 average accuracy for digit 4: 0.920893145126444 average fake ratio for digit 4: 0.0960240432324605 average accuracy for digit 5: 0.8408018318415161 average fake ratio for digit 5: 0.1294609324214388 average accuracy for digit 6: 0.9554260366833954 average fake ratio for digit 6: 0.0684583760018743 average accuracy for digit 7: 0.926049736386733 average fake ratio for digit 7: 0.06885281856100248 average accuracy for digit 8: 0.8528443384009231 average fake ratio for digit 8: 0.1723316647143363 average accuracy for digit 9: 0.8865079365079364 average fake ratio for digit 9: 0.1401572139170504 Average Micro Accuracy train/test 0.922 0.911 Average Macro Accuracy train/test 0.92 0.91
Now using the X and y samples and no k-folds, train with the full sample, using the X_holdout and y_holdout data as your "test" data. You can do this with a single call to runFitter.
Then calculate the per-digit accuracy for this case (again, you can use the confusion matrix from the holdout sample to calculate this).
Compare these accuracies to those obtained from the average of the k-folds, in the following way:
#
# Now fit to the full working sample and use the holdout sample as the test set
finalResults = runFitter(estimator, X, y, X_holdout, y_holdout)
allData_accuracies_by_digit = defaultdict(float)


def _show_confusion(title, cf):
    # Pretty-print a 10x10 confusion matrix: rows = true digit, cols = predicted.
    print(title)
    for trueClass in range(10):
        print("True: ", trueClass, end="")
        for predClass in range(10):
            print("\t", cf[trueClass][predClass], end="")
        print()
    print()


_show_confusion("Test confusion matrix of all data", finalResults['cf_test'])
_show_confusion("Train confusion matrix", finalResults['cf_train'])

# Per-digit accuracy on the holdout sample: diagonal / row sum
for i in range(10):
    row = finalResults['cf_test'][i]
    allData_accuracies_by_digit[f'Digit {i}'] = row[i] / sum(row.values())
    print(f'All Data accuracy for digit {i}: ', str(allData_accuracies_by_digit[f'Digit {i}']))
print('accuracyMicro for train set: ' + str(finalResults['accuracyMicro_train']))
print('accuracyMacro for train set: ' + str(finalResults['accuracyMacro_train']))
print('accuracyMicro for test set: ' + str(finalResults['accuracyMicro_test']))
print('accuracyMacro for test set: ' + str(finalResults['accuracyMacro_test']))
Test confusion matrix of all data True: 0 717 0 5 3 0 2 4 0 4 0 True: 1 0 753 5 4 1 3 4 1 2 5 True: 2 4 8 636 13 7 2 13 7 20 0 True: 3 4 6 23 664 2 21 3 10 18 7 True: 4 2 3 6 0 642 5 4 1 4 26 True: 5 5 6 4 23 7 518 15 2 31 11 True: 6 2 3 10 0 3 8 651 0 7 0 True: 7 3 6 5 7 5 1 0 650 8 36 True: 8 3 21 11 20 6 18 6 3 540 13 True: 9 2 2 3 13 17 4 1 22 6 588 Train confusion matrix True: 0 6024 1 12 8 11 21 39 6 39 7 True: 1 1 6946 35 7 6 19 8 9 60 8 True: 2 42 52 5637 88 70 20 65 72 213 21 True: 3 22 19 159 5724 9 148 27 57 146 72 True: 4 10 18 28 10 5730 10 41 12 63 209 True: 5 55 20 33 214 61 4873 115 19 213 88 True: 6 35 13 35 4 25 79 5955 1 42 3 True: 7 19 28 79 16 62 9 7 6133 19 200 True: 8 51 144 75 148 41 197 47 33 5343 105 True: 9 40 29 24 102 181 37 2 176 68 5641 All Data accuracy for digit 0: 0.9755102040816327 All Data accuracy for digit 1: 0.967866323907455 All Data accuracy for digit 2: 0.895774647887324 All Data accuracy for digit 3: 0.8759894459102903 All Data accuracy for digit 4: 0.9264069264069265 All Data accuracy for digit 5: 0.8327974276527331 All Data accuracy for digit 6: 0.9517543859649122 All Data accuracy for digit 7: 0.9015256588072122 All Data accuracy for digit 8: 0.8424336973478939 All Data accuracy for digit 9: 0.8936170212765957 accuracyMicro for train set: 0.9207301587301587 accuracyMacro for train set: 0.9194655773974233 accuracyMicro for test set: 0.9084285714285715 accuracyMacro for test set: 0.9063675739242976
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
allData = np.zeros(10)
averagekfoldData = np.zeros(10)
digits = np.arange(0,10,1)
for i in range(10):
allData[i] = allData_accuracies_by_digit[f'Digit {i}']
averagekfoldData[i] = str(accuracies_by_digit[f'Digit {i}'])
accuracydata = {'Digit':[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
'Accuracy using all data': allData,
'Average Accuracy from all kfold runs': averagekfoldData}
accuracy_df = pd.DataFrame(accuracydata)
accuracy_df
| Digit | Accuracy using all data | Average Accuracy from all kfold runs | |
|---|---|---|---|
| 0 | 0 | 0.975510 | 0.971141 |
| 1 | 1 | 0.967866 | 0.976053 |
| 2 | 2 | 0.895775 | 0.882325 |
| 3 | 3 | 0.875989 | 0.888294 |
| 4 | 4 | 0.926407 | 0.920893 |
| 5 | 5 | 0.832797 | 0.840802 |
| 6 | 6 | 0.951754 | 0.955426 |
| 7 | 7 | 0.901526 | 0.926050 |
| 8 | 8 | 0.842434 | 0.852844 |
| 9 | 9 | 0.893617 | 0.886508 |
# Scatter the all-data accuracy against the k-fold average, one point per digit
fig = px.scatter(
    accuracy_df,
    x='Accuracy using all data',
    y='Average Accuracy from all kfold runs',
    color='Digit',
    title='Accuracy using all data vs Average Accuracy from all kfold runs',
)
fig.update_layout(xaxis=dict(tickmode='linear', dtick=0.02))
fig.show()
First, print out the holdout confusion matrix from Task 3.
Note that the accuracy for each digit is the diagonal element for each row, divided by the sum of all of the numbers in that row.
What do the columns tell us? We will define the "fake ratio" as the sum of all of the non-diagonal terms in each column, divided by the diagonal element in that column.
Calculate this "fake ratio", and then:
# Re-display the holdout confusion matrix (rows = true digit, cols = predicted)
print("Test confusion matrix of all data")
cf_holdout = finalResults['cf_test']
for actual in range(10):
    print("True: ", actual, end="")
    for predicted in range(10):
        print("\t", cf_holdout[actual][predicted], end="")
    print()
print()
Test confusion matrix of all data True: 0 717 0 5 3 0 2 4 0 4 0 True: 1 0 753 5 4 1 3 4 1 2 5 True: 2 4 8 636 13 7 2 13 7 20 0 True: 3 4 6 23 664 2 21 3 10 18 7 True: 4 2 3 6 0 642 5 4 1 4 26 True: 5 5 6 4 23 7 518 15 2 31 11 True: 6 2 3 10 0 3 8 651 0 7 0 True: 7 3 6 5 7 5 1 0 650 8 36 True: 8 3 21 11 20 6 18 6 3 540 13 True: 9 2 2 3 13 17 4 1 22 6 588
# "Fake ratio" per digit: sum of the off-diagonal entries in the digit's
# column (other digits misclassified as this one) divided by the diagonal
# element (correct predictions of this digit).
allData_ratios = np.zeros(10)
for i in range(10):
    diagonal = finalResults['cf_test'][i][i]  # hoisted: was reassigned every inner iteration
    fakes = sum(finalResults['cf_test'][j][i] for j in range(10) if j != i)
    allData_ratios[i] = fakes / diagonal
averagekfoldData_ratios = np.zeros(10)
digits_ratios = np.arange(0, 10, 1)
for i in range(10):
    # FIX: store the float directly instead of round-tripping through str()
    # before assignment into a float ndarray.
    averagekfoldData_ratios[i] = fakeratio_by_digit[f'Digit {i}']
fakeratiodata = {'Digit': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                 'Fake ratio using all data': allData_ratios,
                 'Average fake ratio from all kfold runs': averagekfoldData_ratios}
fakeratio_df = pd.DataFrame(fakeratiodata)
fakeratio_df
| Digit | Fake ratio using all data | Average fake ratio from all kfold runs | |
|---|---|---|---|
| 0 | 0 | 0.034868 | 0.055770 |
| 1 | 1 | 0.073041 | 0.053537 |
| 2 | 2 | 0.113208 | 0.095634 |
| 3 | 3 | 0.125000 | 0.117453 |
| 4 | 4 | 0.074766 | 0.096024 |
| 5 | 5 | 0.123552 | 0.129461 |
| 6 | 6 | 0.076805 | 0.068458 |
| 7 | 7 | 0.070769 | 0.068853 |
| 8 | 8 | 0.185185 | 0.172332 |
| 9 | 9 | 0.166667 | 0.140157 |
# Scatter the all-data fake ratio against the k-fold average, one point per digit
fig = px.scatter(
    fakeratio_df,
    x='Fake ratio using all data',
    y='Average fake ratio from all kfold runs',
    color='Digit',
    title='Fake ratio using all data vs Average fake ratio from all kfold runs',
)
fig.update_layout(xaxis=dict(tickmode='linear', dtick=0.02),
                  yaxis=dict(tickmode='linear', dtick=0.02))
fig.show()
We usually focus on getting a lot of data to do our training. But how many examples of each digit do we really need?
For this exercise, let's do the following:
Plot the average micro accuracy vs sample size for test and training on the same plot.
def cut(cut_size, X, y):
    """Return a balanced subsample with at most `cut_size` examples per digit.

    Parameters
    ----------
    cut_size : maximum number of rows to keep for each digit 0-9
    X : feature array with 784 columns (one row per example)
    y : label array aligned with X

    Returns (X_cut, y_cut): the stacked features and labels, ordered by digit.
    """
    frames = []
    for digit in range(10):
        # Indices of all examples of this digit, truncated to cut_size
        rows = np.where(y == digit)[0][:cut_size]
        frame = pd.DataFrame(X[rows])
        frame['digit'] = digit
        frames.append(frame)
    dfcut = pd.concat(frames)
    X_cut = dfcut.iloc[:, :784].to_numpy()
    y_cut = dfcut.iloc[:, 784].values
    return X_cut, y_cut
def getmicro_from_cuts(estimator, cut_sizes, X, y, X_holdout, y_holdout):
    """Train on progressively larger balanced subsamples and collect the
    micro accuracy for each size.

    Parameters
    ----------
    estimator : classifier passed through to runFitter
    cut_sizes : iterable of per-digit sample sizes to try
    X, y : full working sample to subsample from
    X_holdout, y_holdout : fixed evaluation sample

    Returns a DataFrame with columns 'Sample Size', 'Average Micro Accuracy',
    and 'Test Set or Train Set' (two rows per cut size: Train and Test).

    FIX: the original duplicated the entire fit-and-record logic in separate
    i == 0 / i != 0 branches just to create the first DataFrame; collecting
    the per-size frames in a list and concatenating once removes the
    duplication without changing the result.
    """
    frames = []
    for cut_size in cut_sizes:
        # Balanced subsample with cut_size examples of each digit
        X_cut, y_cut = cut(cut_size, X, y)
        results_cut = runFitter(estimator, X_cut, y_cut, X_holdout, y_holdout)
        frames.append(pd.DataFrame({
            'Sample Size': [cut_size, cut_size],
            'Average Micro Accuracy': [results_cut['accuracyMicro_train'],
                                       results_cut['accuracyMicro_test']],
            'Test Set or Train Set': ['Train', 'Test'],
        }))
    microacc_df = pd.concat(frames)
    return microacc_df
# Per-digit sample sizes to scan
cuts = [1, 10, 100, 1000, 2000, 3000, 4000, 5000]
micro_df = getmicro_from_cuts(estimator, cuts, X, y, X_holdout, y_holdout)
# Plot train/test micro accuracy against the sample size
fig = px.scatter(
    micro_df,
    x='Sample Size',
    y='Average Micro Accuracy',
    color='Test Set or Train Set',
    title='Sample Size vs Average Micro Accuracy',
)
fig.update_layout(xaxis=dict(tickmode='linear', dtick=500),
                  yaxis=dict(tickmode='linear', dtick=0.05))
fig.show()